package com.shujia.core

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Demo7GroupBy {
  /**
   * Demo of the `groupBy` RDD operator: computes the average student age
   * per class from a CSV file (spark/data/students.csv).
   *
   * Runs with a local master so it can be executed directly from the IDE.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("groupBy算子演示")
    val context = new SparkContext(conf)
    //====================================================
    val studentRDD: RDD[String] = context.textFile("spark/data/students.csv")

    val splitRDD: RDD[Array[String]] = studentRDD.map((s: String) => s.split(","))

    // Requirement: compute the average age of each class.
    // Use pattern matching to extract the class name and the age.
    // `collect` (instead of `map`) silently skips rows that don't have exactly
    // five fields, so one malformed CSV line can no longer abort the whole job
    // with a MatchError. Well-formed input produces identical results.
    val clazzWithAgeRDD: RDD[(String, Int)] = splitRDD.collect {
      case Array(_, _, age: String, _, clazz: String) => (clazz, age.toInt)
    }

    /**
     * groupBy: groups records by the given key function, producing a KV-style
     * RDD whose key is the grouping field and whose value is an iterable
     * holding all original records for that key.
     *
     * key:   the grouping field
     * value: a Spark iterable over the grouped records
     * The iterable's contents are not necessarily fully materialized in
     * memory; an iterator over it can only be consumed once.
     *
     * groupBy triggers a shuffle.
     */
    // Group by class (compare with the Scala-collections equivalent):
    //val stringToStudents: Map[String, List[Student]] = stuList.groupBy((s: Student) => s.clazz)
    val kvRDD: RDD[(String, Iterable[(String, Int)])] = clazzWithAgeRDD.groupBy(_._1)
    val clazzAvgAgeRDD: RDD[(String, Double)] = kvRDD.map {
      case (clazz: String, itr: Iterable[(String, Int)]) =>
        //CompactBuffer((理科二班,21), (理科二班,23), (理科二班,21), (理科二班,23), (理科二班,21), (理科二班,21), (理科二班,24))
        //CompactBuffer(21,23,21,23,21,21,24)
        val allAge: Iterable[Int] = itr.map((kv: (String, Int)) => kv._2)
        val avgAge: Double = allAge.sum.toDouble / allAge.size
        (clazz, avgAge)
    }

    clazzAvgAgeRDD.foreach(println)

    // Keep the driver (and thus the Spark web UI at localhost:4040) alive for
    // inspection. Sleeping inside the loop avoids the busy-wait of an empty
    // `while (true) {}`, which would pin a CPU core at 100%.
    while (true) {
      Thread.sleep(10000)
    }
  }

}
